Research question
Between 2016 and 2022 there were major shifts in the majorities in the US Congress:
In 2016, both chambers were held by Republicans.
In 2018, the Democrats gained a majority in the House of Representatives.
In 2020, the Democrats won majorities in both the House and the Senate.
One might expect the policies of the congresses elected in 2016 and 2020 to differ. However, the period between 2018 and 2020 is also interesting: the two chambers had different majorities and needed to cooperate.
We focus on the question of whether and how these different majorities affected the policies passed by Congress.
Scraping and cleaning the data
We scraped our data from: https://data.gov/developers/apis/index.html
# Bill-level table with summary text, policy area and cosponsor party shares.
df <- read_csv("https://raw.githubusercontent.com/juka19/tad_assignment3/main/data/data_11_28.csv")
## New names:
## Rows: 920 Columns: 13
## -- Column specification
## -------------------------------------------------------- Delimiter: "," chr
## (3): subjects, summary, policy_area dbl (8): ...1, ...2, ...3, Unnamed: 0, bill
## number, cosponsor_D_perc, cospo... date (2): latest_action, date
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `` -> `...1`
## * `...1` -> `...2`
## * `...2` -> `...3`
# Cosponsor-level table: cosponsor name and party, keyed by bill number.
df_cosp <- read_csv("https://raw.githubusercontent.com/juka19/tad_assignment3/main/data/cosponsors_sponsors.csv")
## New names:
## Rows: 21448 Columns: 4
## -- Column specification
## -------------------------------------------------------- Delimiter: "," chr
## (2): cosponsor_name, cosponsor_party dbl (2): ...1, number
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `` -> `...1`
# Bill metadata (title, origin chamber, congress, ...).
# The path is built with file.path() so it resolves on every operating system;
# a hard-coded backslash path only works on Windows.
df_bills <- read_csv(file.path("data", "old_data", "all_bills.csv"))
## New names:
## Rows: 986 Columns: 13
## -- Column specification
## -------------------------------------------------------- Delimiter: "," chr
## (6): latestAction, originChamber, originChamberCode, title, type, url dbl (5):
## ...1, index, Unnamed: 0, congress, number dttm (1): updateDateIncludingText
## date (1): updateDate
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `` -> `...1`
# Preview the first five rows of the bill table.
head(df, 5)
## # A tibble: 5 x 13
## ...1 ...2 ...3 `Unnamed: 0` `bill number` subjects summary policy_area
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 1 0 0 4996 "{'legislati~ "Bankr~ Finance an~
## 2 2 2 1 1 8906 "{'legislati~ "Lifes~ Health
## 3 3 3 2 2 8810 "{'legislati~ "Natio~ Emergency ~
## 4 4 4 3 3 8611 "{'legislati~ "Desig~ Government~
## 5 5 5 4 4 8354 "{'legislati~ "Servi~ Civil Righ~
## # ... with 5 more variables: latest_action <date>, cosponsor_D_perc <dbl>,
## # cosponsor_R_perc <dbl>, date <date>, session <dbl>
Creating party variable
# Classify each bill by the party make-up of its cosponsors:
#   - more than 66% Democratic cosponsors -> "Democrat"
#   - more than 66% Republican cosponsors -> "Republican"
#   - no clear majority                   -> "Both"
# ifelse() propagates NA, so bills with a missing cosponsor share stay NA.
df$party <- ifelse(
  df$cosponsor_D_perc > 0.66,
  "Democrat",
  ifelse(df$cosponsor_R_perc > 0.66, "Republican", "Both")
)
Summary statistics
Density of cosponsors
# Histogram with a density overlay of the Democratic cosponsor share per bill.
# after_stat(density) replaces the deprecated `..density..` notation.
# The caption is a multi-line string; its continuation lines stay flush left
# so that no extra whitespace ends up in the rendered caption.
ggplot(df, aes(x = cosponsor_D_perc)) +
  geom_histogram(aes(y = after_stat(density)), colour = "black", fill = "white") +
  geom_density(alpha = .1, fill = "blue") +
  labs(
    title = "Density of bill cosponsor party",
    x = "Cosponsor party composition", y = "Density",
    caption = "Numbers represent proportion of cosponsors from Democratic party,
so 0.0 represents bills that were fully Republican and 1.0 represents
bills that were fully Democrat."
  ) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Policy fields
# Read the stored polar line plot (JSON) and render it as a plotly widget.
# file.path() keeps the location portable; the backslash form is Windows-only.
fromJSON(file = file.path("slides", "plots", "polar_line_plot")) %>%
  plotly::as_widget()
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
Creating a corpus
# Expose the bill summary under the name "text" and tell corpus() explicitly
# that this column holds the document text; all other columns become docvars.
df_corp <- df %>%
  rename(text = summary)
corp <- corpus(df_corp, text_field = "text")
## Warning: NA is replaced by empty string
Creating a dfm from the corpus
# Document-feature matrix of the bill summaries:
# tokenise, drop stop words, lemmatise, stem, then remove legislative
# boilerplate terms before counting features.
dfmat <- corp %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  # `pattern` is spelled out; the original relied on partial matching ("patter").
  tokens_remove(pattern = stopwords("en")) %>%
  tokens_replace(
    pattern = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  tokens_wordstem() %>%
  # Runs after stemming, so most entries are listed in their stemmed form.
  tokens_remove(c("sec","bill","act", "section", "funds", "shall","must", "use", "author","fund","provid","program","requir","divis","titl","appropri","specifi")) %>%
  dfm()
Wordclouds
Most common words in all congresses
# Comparison word cloud of the most frequent terms per congressional session.
# The corpus is tokenised first and grouped with dfm_group(): calling dfm()
# directly on a corpus with `remove` / `groups` is deprecated since quanteda 3.0.
dfmatCon <- corp %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE) %>%
  dfm() %>%
  dfm_remove(stopwords("english")) %>%
  # One row per session; the grouping vector has one entry per document.
  dfm_group(groups = corp$session) %>%
  dfm_remove(c("sec","bill","act", "section", "funds", "shall","must", "used")) %>%
  dfm_trim(min_termfreq = 3)
textplot_wordcloud(dfmatCon, comparison = TRUE, max_words = 300,
                   color = c("blue", "red"))
#Wordcloud congress 115
Comparing the 115th and 116th congress
dfmat_115 <- dfm_subset(dfmat, session == 115)

# 115th Congress: comparison word cloud of summary terms by cosponsor party.
# Tokenise -> dfm -> dfm_group() replaces the deprecated dfm(corpus, remove =,
# groups =) call (deprecated since quanteda 3.0).
corp_115 <- df %>%
  filter(session == 115) %>%
  rename(text = summary) %>%
  corpus()
modelpart15 <- corp_115 %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE) %>%
  dfm() %>%
  dfm_remove(stopwords("english")) %>%
  dfm_group(groups = corp_115$party) %>%
  dfm_remove(c("sec","bill","act", "section", "funds", "shall","must", "used")) %>%
  dfm_trim(min_termfreq = 3)
mp15 <- textplot_wordcloud(modelpart15, comparison = TRUE, max_words = 300,
                           color = c("green","blue", "red"))

# 116th Congress: same pipeline as above.
corp_116 <- df %>%
  filter(session == 116) %>%
  rename(text = summary) %>%
  corpus()
modelpart16 <- corp_116 %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE) %>%
  dfm() %>%
  dfm_remove(stopwords("english")) %>%
  dfm_group(groups = corp_116$party) %>%
  dfm_remove(c("sec","bill","act", "section", "funds", "shall","must", "used")) %>%
  dfm_trim(min_termfreq = 3)
mp16 <- textplot_wordcloud(modelpart16, comparison = TRUE, max_words = 300,
                           color = c("green","blue", "red"))
Dimensionality plotting
# Two-dimensional UMAP projection of all bill summaries, coloured by party.
corp2 <- corpus(df$summary)
## Warning: NA is replaced by empty string
dfmat2 <- corp2 %>%
  tokens(remove_punct = TRUE) %>%
  # `pattern` is spelled out; the original relied on partial matching ("patter").
  tokens_remove(pattern = stopwords("en")) %>%
  dfm() %>%
  dfm_trim(min_termfreq = 5)

# The first two embedding columns become the plot coordinates.
embeddings <- umap(as.matrix(dfmat2))
df$x <- embeddings[, 1]
df$y <- embeddings[, 2]

colordict <- c("Democrat" = "blue", "Republican" = "red", "Both" = "yellow")
p <- ggplot(df, aes(x, y, fill = party)) +
  geom_point(color = "grey", shape = 21, size = 0.5) +
  scale_fill_manual(values = colordict) +
  theme_bw()
p <- ggplotly(p)
p

# Repeat the projection for bills whose cosponsors all belong to one party.
# The original text fused the print of `p` with this assignment ("pdf1"),
# which left `df1` undefined for the statements below.
df1 <- df %>%
  mutate(
    party_full = ifelse(
      cosponsor_D_perc == 1.0, "Dem",
      ifelse(cosponsor_R_perc == 1.0, "Rep", NA)
    )
  ) %>%
  drop_na(party_full)

corp3 <- corpus(df1$summary)
dfmat3 <- corp3 %>%
  tokens(remove_punct = TRUE) %>%
  # `pattern` is spelled out; the original relied on partial matching ("patter").
  tokens_remove(pattern = stopwords("en")) %>%
  dfm() %>%
  dfm_trim(min_termfreq = 5)

embeddings2 <- umap(as.matrix(dfmat3))
df1$x <- embeddings2[, 1]
df1$y <- embeddings2[, 2]

# The fill uses the `party` column carried over from `df`, not `party_full`.
colordict2 <- c("Democrat" = "blue", "Republican" = "red")
j <- ggplot(df1, aes(x, y, fill = party)) +
  geom_point(color = "grey", shape = 21, size = 0.5) +
  scale_fill_manual(values = colordict2) +
  theme_bw()
j <- ggplotly(j)
j
Sentiment analysis
# Bill table with sentiment scores attached (VADER, going by the file name).
summary_sentiment <- read_csv("https://raw.githubusercontent.com/juka19/tad_assignment3/main/data/data_w_vader.csv")
## New names:
## Rows: 920 Columns: 18
## -- Column specification
## -------------------------------------------------------- Delimiter: "," chr
## (3): subjects, summary, policy_area dbl (13): ...1, Unnamed: 0, ...3, ...4,
## Unnamed: 0.1, bill number, cosponso... date (2): latest_action, date
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `` -> `...1`
## * `...1` -> `...3`
## * `...2` -> `...4`
# Party classification by cosponsor share: more than 66% from one party gives
# that party, otherwise "Both"; missing shares stay NA.
summary_sentiment$party <- ifelse(
  summary_sentiment$cosponsor_D_perc > 0.66,
  "Democrat",
  ifelse(summary_sentiment$cosponsor_R_perc > 0.66, "Republican", "Both")
)

# Mean compound sentiment per party and date, reshaped to one column per party.
wide_sentiment <- summary_sentiment %>%
  group_by(party, date) %>%
  summarise(score = mean(compound)) %>%
  # Drop the leftover `party` grouping before reshaping.
  ungroup() %>%
  pivot_wider(names_from = party, values_from = score) %>%
  # Keep only the single-party series; any_of() tolerates a data set in which
  # the "Both" or "NA" column does not occur.
  select(-any_of(c("Both", "NA")))
## `summarise()` has grouped output by 'party'. You can override using the
## `.groups` argument.
# One row per calendar day from 2017-01-01 to 2022-12-31.
days <- data.frame(
  date = seq(as.Date("2017-01-01"), as.Date("2022-12-31"), by = 1)
)

# Attach the per-party scores to the calendar (days without a score become NA)
# and return to long format: one row per day and party.
daily_sentiment <- days %>%
  left_join(wide_sentiment) %>%
  pivot_longer(cols = -date, names_to = "party", values_to = "score")
## Joining, by = "date"
# Daily mean sentiment per party with a LOESS trend line.
# The colours are named so that each party keeps its colour regardless of the
# order in which the levels appear.
p4 <- ggplot(daily_sentiment, aes(x = date, y = score, colour = party)) +
  geom_point(size = 1) +
  theme_minimal() +
  geom_smooth(method = "loess", se = FALSE) +
  scale_color_manual(values = c(Democrat = "blue", Republican = "red"))
ggplotly(p4)
## `geom_smooth()` using formula 'y ~ x'
Network graph of House members in the 116th Congress
# Cosponsor records for bills of session 116 that originated in the House.
df_labels <- df_cosp %>%
  inner_join(df, by = c("number" = "bill number")) %>%
  inner_join(
    df_bills %>% select(number, originChamber),
    by = "number"
  ) %>%
  filter(session == 116, originChamber == "House") %>%
  mutate(
    # Remove every "<word character>. " sequence (e.g. a middle initial) ...
    cosponsor_name = str_replace_all(cosponsor_name, "\\w\\.\\s", ""),
    # ... then lower-case and join the name parts with underscores.
    cosponsor_name = str_replace_all(tolower(cosponsor_name), " ", "_")
  )

# One row per bill: all of its cosponsor names in a single space-separated string.
df_grouped <- df_labels %>%
  group_by(number) %>%
  summarize(cosps = paste(cosponsor_name, collapse = " "))
# Treat each bill's cosponsor string as a document and build a feature
# co-occurrence matrix from it, then convert that matrix into an undirected,
# weighted graph in visNetwork format.
visn <- corpus(df_grouped$cosps) %>%
  tokens() %>%
  fcm() %>%
  as.matrix() %>%
  graph_from_adjacency_matrix(mode = "undirected", weighted = TRUE) %>%
  toVisNetworkData()

# Copy the edge weight into `value`.
visn$edges$value <- visn$edges$weight

# Keep only edges with a weight above 25.
edges <- data.frame(visn$edges) %>%
  filter(weight > 25)
# Node table: attach each member's party, style the node by party, keep only
# members that appear in a retained edge, and prettify the display label.
nodes <- data.frame(visn$nodes) %>%
  left_join(
    df_labels %>% select(cosponsor_name, cosponsor_party) %>% distinct(),
    by = c("label" = "cosponsor_name")
  ) %>%
  # Keep the first row for a label that matched more than one party record.
  filter(!duplicated(label)) %>%
  rename(group = cosponsor_party) %>%
  mutate(
    color.background = if_else(group == "R", "darkred", "darkblue"),
    shadow = TRUE,
    font.color = "black",
    borderWidth = 2
  ) %>%
  filter(label %in% edges$from | label %in% edges$to) %>%
  mutate(
    label = str_replace_all(label, "_", " ") %>% tools::toTitleCase(),
    title = label
  )
# Interactive cosponsorship network.
# NOTE(review): `borderWidth` sits inside the `color` list here; it is already
# set per node in `nodes`, so confirm whether this entry has any effect.
network <- visNetwork(nodes, edges) %>%
  visNodes(
    color = list(
      border = "darkgrey",
      borderWidth = 2,
      highlight = list(border = "darkgrey", background = "orange")
    )
  ) %>%
  visEdges(color = list(color = "#d3d3d3", highlight = "orange")) %>%
  visPhysics(maxVelocity = 3)
network
Topic modeling
# Stored UMAP projection of the bill summaries.
# file.path() keeps the location portable; the backslash form is Windows-only.
# NOTE(review): this data frame shares its name with the umap() function used
# earlier; R still resolves umap(...) calls to the function, but a distinct
# name would be clearer.
umap <- read.csv(file.path("data", "umap_projection_data.csv"))

# Scatter plot coloured by Democratic cosponsor share. The `text` aesthetic
# carries the bill details so they are available to the interactive plot.
p <- umap %>%
  ggplot(aes(x, y, color = cosponsor_D_perc,
             text = paste("Title:", title,
                          "<br>Policy Area: ", policyarea,
                          "<br>Session: ", session,
                          "<br>Topic: ", cluster))) +
  geom_point() +
  scale_color_gradient2(midpoint = .5, name = "Democrat share") +
  theme_minimal() +
  labs(x = "", y = "", title = "UMAP Projection of US Congress bill summaries 115-117") +
  theme(panel.background = element_rect(fill = "#fbfbfb", colour = "#fbfbfb"))
ggp <- ggplotly(p)
ggp
Conclusion
- In general, the legislation appears quite similar, with exceptions limited to specific policy areas and topics
- Democratic and Republican cooperation across the different congress sessions is highly dependent on the policy area and topic of the bill
- There are some key actors that enable working across the aisle and are known for bipartisan collaboration